{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cosine similarity with other candidates of the same party"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per election year: mean cosine similarity between each first-round manifesto and\n",
    "# the manifestos of the same party, on unigram counts and on unigram TF-IDF weights.\n",
    "# One CSV per year is written to data/intermediate.\n",
    "list_year=['1962', '1967', '1968', '1973', '1978', '1981', '1993', '1997', '2017']\n",
    "meta_cols=['year', 'id_district', 'id_unique_cand', 'party']\n",
    "for year in list_year:\n",
    "    print(year)\n",
    "\n",
    "    df=pd.read_pickle(dir + '/data/intermediate/df_' + year)\n",
    "    df=df[df['tour']==1].reset_index(drop=True) #first round only\n",
    "\n",
    "    # min_df=0.005 drops terms that appear in fewer than 0.5% of the documents\n",
    "    vectorizers=[('mean_similarity_party', CountVectorizer(min_df=0.005)),\n",
    "                 ('mean_similarity_party_tf', TfidfVectorizer(min_df=0.005))]\n",
    "    results={}\n",
    "    for col, countvec in vectorizers:\n",
    "        weights=countvec.fit_transform(df.text_clean).toarray()\n",
    "        # get_feature_names() was removed in scikit-learn 1.2; only old versions need it\n",
    "        if hasattr(countvec, 'get_feature_names_out'):\n",
    "            vocab=countvec.get_feature_names_out()\n",
    "        else:\n",
    "            vocab=countvec.get_feature_names()\n",
    "        dtm_count=pd.DataFrame(weights, columns=vocab, index=df.index) #turns the matrix back into a readable dataset\n",
    "        # lsuffix renames vocabulary terms that collide with a metadata column name,\n",
    "        # so the last four columns of dtm are always the metadata columns\n",
    "        dtm=dtm_count.join(df[meta_cols], lsuffix='_x')\n",
    "\n",
    "        #mean similarity to all manifestos in party\n",
    "        ref=dtm[dtm['party']!='']\n",
    "        frames=[]\n",
    "        # unique() keeps first-appearance order, so the loop order does not depend on set hashing\n",
    "        for cand in ref['id_unique_cand'].unique():\n",
    "            source=ref[ref['id_unique_cand']==cand].reset_index(drop=True)\n",
    "            p=source['party'][0]\n",
    "            target=ref[(ref['party']==p)].reset_index(drop=True)\n",
    "            # NOTE(review): target includes the candidate's own rows, so self-similarity is\n",
    "            # part of the mean; the guard below can then only fail when the party value is\n",
    "            # not equal to itself (e.g. NaN). Confirm whether the candidate should be excluded.\n",
    "            if len(target)>0:\n",
    "                a=np.mean(cosine_similarity(source.iloc[:,:-4], target.iloc[:,:-4]), axis=1)\n",
    "                temp=source[['year', 'id_district', 'id_unique_cand']].join(pd.DataFrame({col:a}))\n",
    "                frames.append(temp)\n",
    "        # DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call\n",
    "        results[col]=pd.concat(frames) if frames else pd.DataFrame()\n",
    "\n",
    "    df_all=results['mean_similarity_party']\n",
    "    df_all2=results['mean_similarity_party_tf']\n",
    "    # NOTE(review): merging on id_unique_cand alone pairs every count row with every TF-IDF\n",
    "    # row of the same candidate, so a candidate with several rows gets duplicated rows --\n",
    "    # confirm that ids are unique within a year.\n",
    "    df_all=pd.merge(df_all, df_all2[['id_unique_cand', 'mean_similarity_party_tf']], on='id_unique_cand', how='outer')\n",
    "\n",
    "    df_all.to_csv(dir + '/data/intermediate/df_similarity_party_' + year + '.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cosine similarity with other candidates of the same party (bigrams)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per election year: mean cosine similarity between each first-round manifesto and\n",
    "# the manifestos of the same party, on bigram counts and on bigram TF-IDF weights.\n",
    "# One CSV per year is written to data/intermediate.\n",
    "list_year=['1962', '1967', '1968', '1973', '1978', '1981', '1993', '1997', '2017']\n",
    "meta_cols=['year', 'id_district', 'id_unique_cand', 'party']\n",
    "for year in list_year:\n",
    "    print(year)\n",
    "\n",
    "    df=pd.read_pickle(dir + '/data/intermediate/df_' + year)\n",
    "    df=df[df['tour']==1].reset_index(drop=True) #first round only\n",
    "\n",
    "    # ngram_range=(2,2) keeps bigrams only; min_df=0.005 drops bigrams that appear in\n",
    "    # fewer than 0.5% of the documents\n",
    "    vectorizers=[('mean_similarity_party', CountVectorizer(min_df=0.005, ngram_range=(2,2))),\n",
    "                 ('mean_similarity_party_tf', TfidfVectorizer(min_df=0.005, ngram_range=(2,2)))]\n",
    "    results={}\n",
    "    for col, countvec in vectorizers:\n",
    "        weights=countvec.fit_transform(df.text_clean).toarray()\n",
    "        # get_feature_names() was removed in scikit-learn 1.2; only old versions need it\n",
    "        if hasattr(countvec, 'get_feature_names_out'):\n",
    "            vocab=countvec.get_feature_names_out()\n",
    "        else:\n",
    "            vocab=countvec.get_feature_names()\n",
    "        dtm_count=pd.DataFrame(weights, columns=vocab, index=df.index) #turns the matrix back into a readable dataset\n",
    "        # lsuffix renames vocabulary terms that collide with a metadata column name,\n",
    "        # so the last four columns of dtm are always the metadata columns\n",
    "        dtm=dtm_count.join(df[meta_cols], lsuffix='_x')\n",
    "\n",
    "        #mean similarity to all manifestos in party\n",
    "        ref=dtm[dtm['party']!='']\n",
    "        frames=[]\n",
    "        # unique() keeps first-appearance order, so the loop order does not depend on set hashing\n",
    "        for cand in ref['id_unique_cand'].unique():\n",
    "            source=ref[ref['id_unique_cand']==cand].reset_index(drop=True)\n",
    "            p=source['party'][0]\n",
    "            target=ref[(ref['party']==p)].reset_index(drop=True)\n",
    "            # NOTE(review): target includes the candidate's own rows, so self-similarity is\n",
    "            # part of the mean; the guard below can then only fail when the party value is\n",
    "            # not equal to itself (e.g. NaN). Confirm whether the candidate should be excluded.\n",
    "            if len(target)>0:\n",
    "                a=np.mean(cosine_similarity(source.iloc[:,:-4], target.iloc[:,:-4]), axis=1)\n",
    "                temp=source[['year', 'id_district', 'id_unique_cand']].join(pd.DataFrame({col:a}))\n",
    "                frames.append(temp)\n",
    "        # DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call\n",
    "        results[col]=pd.concat(frames) if frames else pd.DataFrame()\n",
    "\n",
    "    df_all=results['mean_similarity_party']\n",
    "    df_all2=results['mean_similarity_party_tf']\n",
    "    # NOTE(review): merging on id_unique_cand alone pairs every count row with every TF-IDF\n",
    "    # row of the same candidate, so a candidate with several rows gets duplicated rows --\n",
    "    # confirm that ids are unique within a year.\n",
    "    df_all=pd.merge(df_all, df_all2[['id_unique_cand', 'mean_similarity_party_tf']], on='id_unique_cand', how='outer')\n",
    "\n",
    "    df_all.to_csv(dir + '/data/intermediate/df_similarity_party_big_' + year + '.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Latent Semantic Indexing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Latent Semantic Indexing on unigram TF-IDF weights: per election year, truncate the SVD of\n",
    "# the term-by-document matrix to k singular values, then compute each first-round manifesto's\n",
    "# mean cosine similarity to the manifestos of the same party. One CSV per year is written.\n",
    "list_year=['1962', '1967', '1968', '1973', '1978', '1981', '1993', '1997', '2017']\n",
    "meta_cols=['year', 'id_district', 'id_unique_cand', 'party']\n",
    "k=200 #number of latent dimensions\n",
    "for year in list_year:\n",
    "    print(year)\n",
    "\n",
    "    df=pd.read_pickle(dir + '/data/intermediate/df_' + year)\n",
    "    df['year']=year\n",
    "    df=df[df['tour']==1].reset_index(drop=True) #first round only\n",
    "\n",
    "    # min_df=0.005 drops terms that appear in fewer than 0.5% of the documents\n",
    "    countvec = TfidfVectorizer(min_df=0.005)\n",
    "    weights=countvec.fit_transform(df.text_clean).toarray()\n",
    "    # get_feature_names() was removed in scikit-learn 1.2; only old versions need it\n",
    "    if hasattr(countvec, 'get_feature_names_out'):\n",
    "        vocab=countvec.get_feature_names_out()\n",
    "    else:\n",
    "        vocab=countvec.get_feature_names()\n",
    "    dtm_count = pd.DataFrame(weights, columns=vocab, index = df.index) #turns the matrix back into a readable dataset\n",
    "\n",
    "    # SVD of the terms x documents matrix (plain ndarray rather than the discouraged np.matrix);\n",
    "    # every singular value after the first k is zeroed\n",
    "    u,s,vh=np.linalg.svd(dtm_count.values.T, full_matrices=False)\n",
    "    s[k:]=0\n",
    "\n",
    "    m=np.dot(u, np.dot(np.diag(s), vh)) #reconstruction from the k retained singular values\n",
    "    assert np.linalg.matrix_rank(m.T)==k, 'rank of the truncated reconstruction differs from k'\n",
    "\n",
    "    # NOTE(review): m.T is documents x terms, so iloc[:,:k] keeps the first k term columns of\n",
    "    # the reconstruction rather than k latent coordinates -- confirm this is intended.\n",
    "    dft=pd.DataFrame(m.T).iloc[:,:k].join(df[meta_cols])\n",
    "\n",
    "    #similarity to other candidates from the same party\n",
    "    ref=dft[dft['party']!='']\n",
    "    frames=[]\n",
    "    # unique() keeps first-appearance order, so the row order of the CSV does not depend on set hashing\n",
    "    for cand in ref['id_unique_cand'].unique():\n",
    "        source=ref[ref['id_unique_cand']==cand].reset_index(drop=True)\n",
    "        p=source['party'][0]\n",
    "        target=ref[(ref['party']==p)].reset_index(drop=True)\n",
    "        # NOTE(review): target includes the candidate's own rows, so self-similarity is part\n",
    "        # of the mean; the guard below can then only fail when the party value is not equal\n",
    "        # to itself (e.g. NaN). Confirm whether the candidate should be excluded.\n",
    "        if len(target)>0:\n",
    "            a=np.mean(cosine_similarity(source.iloc[:,:-4], target.iloc[:,:-4]), axis=1)\n",
    "            temp=source[['year', 'id_district', 'id_unique_cand']].join(pd.DataFrame({'mean_similarity_party':a}))\n",
    "            frames.append(temp)\n",
    "    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call\n",
    "    df_all=pd.concat(frames) if frames else pd.DataFrame()\n",
    "\n",
    "    df_all.to_csv(dir + '/data/intermediate/df_lsi_party_' + year + '.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## LSI with bigrams"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Latent Semantic Indexing on bigram TF-IDF weights: per election year, truncate the SVD of\n",
    "# the term-by-document matrix to k singular values, then compute each first-round manifesto's\n",
    "# mean cosine similarity to the manifestos of the same party. One CSV per year is written.\n",
    "list_year=['1962', '1967', '1968', '1973', '1978', '1981', '1993', '1997', '2017']\n",
    "meta_cols=['year', 'id_district', 'id_unique_cand', 'party']\n",
    "k=200 #number of latent dimensions\n",
    "for year in list_year:\n",
    "    print(year)\n",
    "\n",
    "    df=pd.read_pickle(dir + '/data/intermediate/df_' + year)\n",
    "    df['year']=year\n",
    "    df=df[df['tour']==1].reset_index(drop=True) #first round only\n",
    "\n",
    "    # ngram_range=(2,2) keeps bigrams only; min_df=0.005 drops bigrams that appear in\n",
    "    # fewer than 0.5% of the documents\n",
    "    countvec = TfidfVectorizer(min_df=0.005, ngram_range=(2,2))\n",
    "    weights=countvec.fit_transform(df.text_clean).toarray()\n",
    "    # get_feature_names() was removed in scikit-learn 1.2; only old versions need it\n",
    "    if hasattr(countvec, 'get_feature_names_out'):\n",
    "        vocab=countvec.get_feature_names_out()\n",
    "    else:\n",
    "        vocab=countvec.get_feature_names()\n",
    "    dtm_count = pd.DataFrame(weights, columns=vocab, index = df.index) #turns the matrix back into a readable dataset\n",
    "\n",
    "    # SVD of the terms x documents matrix (plain ndarray rather than the discouraged np.matrix);\n",
    "    # every singular value after the first k is zeroed\n",
    "    u,s,vh=np.linalg.svd(dtm_count.values.T, full_matrices=False)\n",
    "    s[k:]=0\n",
    "\n",
    "    m=np.dot(u, np.dot(np.diag(s), vh)) #reconstruction from the k retained singular values\n",
    "    assert np.linalg.matrix_rank(m.T)==k, 'rank of the truncated reconstruction differs from k'\n",
    "\n",
    "    # NOTE(review): m.T is documents x terms, so iloc[:,:k] keeps the first k term columns of\n",
    "    # the reconstruction rather than k latent coordinates -- confirm this is intended.\n",
    "    dft=pd.DataFrame(m.T).iloc[:,:k].join(df[meta_cols])\n",
    "\n",
    "    #similarity to other candidates from the same party\n",
    "    ref=dft[dft['party']!='']\n",
    "    frames=[]\n",
    "    # unique() keeps first-appearance order, so the row order of the CSV does not depend on set hashing\n",
    "    for cand in ref['id_unique_cand'].unique():\n",
    "        source=ref[ref['id_unique_cand']==cand].reset_index(drop=True)\n",
    "        p=source['party'][0]\n",
    "        target=ref[(ref['party']==p)].reset_index(drop=True)\n",
    "        # NOTE(review): target includes the candidate's own rows, so self-similarity is part\n",
    "        # of the mean; the guard below can then only fail when the party value is not equal\n",
    "        # to itself (e.g. NaN). Confirm whether the candidate should be excluded.\n",
    "        if len(target)>0:\n",
    "            a=np.mean(cosine_similarity(source.iloc[:,:-4], target.iloc[:,:-4]), axis=1)\n",
    "            temp=source[['year', 'id_district', 'id_unique_cand']].join(pd.DataFrame({'mean_similarity_party':a}))\n",
    "            frames.append(temp)\n",
    "    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call\n",
    "    df_all=pd.concat(frames) if frames else pd.DataFrame()\n",
    "\n",
    "    df_all.to_csv(dir + '/data/intermediate/df_lsi_party_big_' + year + '.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
